{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('../00_data/') # 存储数据的路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "atec_anti_fraud_test_b.csv       df_50w.csv        submit_0212C.csv\n",
      "atec_anti_fraud_test_b_demo.csv  df_test_50w.csv   submit_0212D.csv\n",
      "atec_anti_fraud_train.csv        df_test.zip       train_head_10w.csv\n",
      "atec_anti_fraud_train_demo.csv   submit_0211A.csv  train_modified.csv\n",
      "df_10w.csv                       submit_0212A.csv\n",
      "df_1m.csv                        submit_0212B.csv\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# util"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_col_null_rate(df, thres = 0.2):\n",
    "    # 计算输入的df中各col null的比例\n",
    "    # input  -- df of data\n",
    "    # output -- df index and null rate\n",
    "    \n",
    "    info = pd.DataFrame()\n",
    "    info['isnull'] = df.isnull().sum()\n",
    "    info['null_ratio'] = info['isnull'] / df.shape[0]\n",
    "    info = info.sort_values(by = 'null_ratio', ascending=False)\n",
    "    info = info[info['null_ratio'] > thres]\n",
    "\n",
    "    return info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fill_col_null_with_mean(df):\n",
    "    # 用于fill df中的NA\n",
    "    # 另一种思路，直接用fillna中的阈值控制来填充，后续再改\n",
    "    col_names = list(df.columns)\n",
    "    for col in tqdm(col_names):\n",
    "        if 'f' in col:  # to exclude: id date label \n",
    "            df[col] = df[col].transform(lambda x: x.fillna(x.mean()))\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_df(df, na_ratio=0.5):\n",
    "    # na_ratio -- na比例\n",
    "    # 1. 去除无标签的行\n",
    "    # 2. 去除na比例大于na_ratio的列\n",
    "    # 3. 其他na 按均值填充\n",
    "    df_eff = df[df.label != -1]\n",
    "    na_thres = int((1-0.5)*df.shape[0])\n",
    "    df_dropna = df_eff.dropna(thresh = na_thres, axis =1)\n",
    "    df_out = fill_col_null_with_mean(df_dropna)\n",
    "\n",
    "    print('remove {} records without label.'.format(df.shape[0] - df_out.shape[0]))\n",
    "    print('remove {} features where na ratio {}.'.format(df.shape[1] - df_out.shape[1], na_ratio))\n",
    "    print(df_out.shape)\n",
    "    return df_out"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and Clean data\n",
    "\n",
    "- 行：去除标签-1的样本\n",
    "- 列：NA比例超过0.5的列\n",
    "- 填充其他的na"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('atec_anti_fraud_train_demo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/288 [00:00<?, ?it/s]/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  import sys\n",
      " 85%|████████▍ | 244/288 [00:17<00:03, 14.66it/s]"
     ]
    }
   ],
   "source": [
    "df_clean = clean_df(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_clean.to_csv('./df_1m.csv', index=False)  # 务必注意index=False，否则存储结果会多一列index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# convert test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test = pd.read_csv('atec_anti_fraud_test_b_demo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_keep_col = list(df_clean.columns.values)  # 得到train data保留的col name\n",
    "train_keep_col.pop(1)  # test data中没有label 列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test = df_test.loc[:,train_keep_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 287/287 [00:06<00:00, 46.70it/s]\n"
     ]
    }
   ],
   "source": [
    "df_test_clean = fill_col_null_with_mean(df_test)  # 填充测试集中的null"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_clean.to_csv('./df_test_50w.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
